{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ccded7e1-f049-4f3d-ab67-2583a8d76075",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-04T22:40:59.166337Z",
     "start_time": "2024-08-04T22:40:59.084921Z"
    }
   },
   "outputs": [],
   "source": [
    "# setting the environment variables\n",
    "import sys\n",
    "import os\n",
    "\n",
    "sys.path.insert(0, os.path.abspath('..'))\n",
    "\n",
    "from config import set_environment\n",
    "set_environment()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3361e98-d544-442e-a2c3-5af7596655e9",
   "metadata": {},
   "source": [
    "## Complex schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "59557cea-949c-4d64-a1bb-87ca6078f94a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-04T22:42:09.245693Z",
     "start_time": "2024-08-04T22:42:09.239623Z"
    }
   },
   "outputs": [],
   "source": [
    "from typing import Optional\n",
    "import pydantic\n",
    "\n",
    "class Experience(pydantic.BaseModel):\n",
    "    start_date: Optional[str]\n",
    "    end_date: Optional[str]\n",
    "    description: Optional[str]\n",
    "\n",
    "class Study(Experience):\n",
    "    degree: Optional[str]\n",
    "    university: Optional[str]\n",
    "    country: Optional[str]\n",
    "    grade: Optional[str]\n",
    "\n",
    "class WorkExperience(Experience):\n",
    "    company: str\n",
    "    job_title: str\n",
    "\n",
    "class Resume(pydantic.BaseModel):\n",
    "    first_name: str\n",
    "    last_name: str\n",
    "    linkedin_url: Optional[str]\n",
    "    email_address: Optional[str]\n",
    "    nationality: Optional[str]\n",
    "    skill: Optional[str]\n",
    "    study: Optional[Study]\n",
    "    work_experience: Optional[WorkExperience]\n",
    "    hobby: Optional[str]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6e9a4137-e093-465f-b187-4f2341dc2ead",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-04T22:42:12.158229Z",
     "start_time": "2024-08-04T22:42:11.312060Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ben/anaconda3/envs/softupdate/lib/python3.10/site-packages/pypdf/_crypt_providers/_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from this module in 48.0.0.\n",
      "  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from langchain_community.document_loaders.pdf import PyPDFLoader\n",
    "\n",
    "pdf_file_path = os.path.expanduser(\"~/Downloads/openresume-resume.pdf\")\n",
    "pdf_loader = PyPDFLoader(pdf_file_path)\n",
    "docs = pdf_loader.load_and_split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3bde73a6-bb19-4dd3-b54f-5a65006faa7d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-04T22:42:33.582220Z",
     "start_time": "2024-08-04T22:42:19.525449Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "{'first_name': 'John',\n 'last_name': 'Doe',\n 'linkedin_url': 'linkedin.com/in/john-doe',\n 'email_address': 'hello@openresume.com',\n 'nationality': None,\n 'skill': 'HTML, TypeScript, CSS, React, Python, C++, Tech: React Hooks, GraphQL, Node.js, SQL, Postgres, NoSql, Redis, REST API, Git, Soft: Teamwork, Creative Problem Solving, Communication, Learning Mindset, Agile',\n 'study': {'start_date': 'Sep 2019',\n  'end_date': 'May 2023',\n  'description': 'Won 1st place in 2022 Education Hackathon, 2nd place in 2023 Health Tech Competition, Teaching Assistant for Programming for the Web (2022 - 2023), Coursework: Object-Oriented Programming (A+), Programming for the Web (A+), Cloud Computing (A), Introduction to Machine Learning (A-), Algorithms Analysis (A-)',\n  'degree': 'Bachelor of Science in Computer Science',\n  'university': 'XYZ University',\n  'country': None,\n  'grade': '3.8 GPA'},\n 'work_experience': {'start_date': 'May 2023',\n  'end_date': 'Present',\n  'description': 'Lead a cross-functional team of 5 engineers in developing a search bar, which enables thousands of daily active users to search content across the entire platform, Create stunning home page product demo animations that drives up sign up rate by 20%, Write clean code that is modular and easy to maintain while ensuring 100% test coverage',\n  'company': 'ABC Company',\n  'job_title': 'Software Engineer'},\n 'hobby': None}"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.output_parsers import PydanticOutputParser\n",
    "from langchain.output_parsers.json import SimpleJsonOutputParser\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "parser = PydanticOutputParser(pydantic_object=Resume)\n",
    "prompt = PromptTemplate(\n",
    "    template=\"Extract information from the provided document.\\n{format_instructions}\\n{document}\\n\",\n",
    "    input_variables=[\"document\"],\n",
    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
    ")\n",
    "llm = ChatOpenAI(model_name=\"gpt-4-turbo-preview\")\n",
    "chain = prompt | llm | SimpleJsonOutputParser()  # or parser\n",
    "\n",
    "chain.invoke({\"document\": docs})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9b94cce-7bc3-4128-876a-12e2bdb922ab",
   "metadata": {},
   "source": [
    "## Custom parser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "739a5f32-d730-4486-8ca2-3e85cc4e2baf",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-04T22:41:05.938331Z",
     "start_time": "2024-08-04T22:41:05.656578Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "[{'Document': \"here's a text\"}]"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "import re\n",
    "from langchain_core.messages.ai import AIMessage\n",
    "\n",
    "def extract_json(message: AIMessage) -> list[dict]:\n",
    "    \"\"\"Extract JSON content from a string where JSON is embedded between ```json and ``` tags.\"\"\"\n",
    "    text = message.content\n",
    "    pattern = r\"```json(.*?)```\"\n",
    "    matches = re.findall(pattern, text, re.DOTALL)\n",
    "    try:\n",
    "        return [json.loads(match.strip()) for match in matches]\n",
    "    except Exception:\n",
    "        raise ValueError(f\"Failed to parse: {message}\")\n",
    "\n",
    "extract_json(AIMessage(content=\"\"\"Some ```json { \"Document\": \"here's a text\" } ```\"\"\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "809ba898-8c9f-4546-a15f-e99f73382e2c",
   "metadata": {},
   "source": [
    "Use like this:\n",
    "\n",
    "chain = prompt | llm | extract_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46cd03f2-a91c-4c2d-ab41-9b179edd7d93",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
